library(tidyverse)
library(plotly)
library(sf)
library(mapview)
library(tigris)
library(censusapi)
library(leaflet)
library(lehdr)
library(usmap)
library(lmtest)
library(pracma)
library(lmtest)
library(forecast)
library(vars)
library(rvest)
library(RSelenium)
library(seleniumPipes)
library(dLagM)
library(jsonlite)
library(rgdal)
library(esri2sf)
library(readr)

options(
  tigris_class = "sf",
  tigris_use_cache = TRUE
)

Sys.setenv(CENSUS_KEY="10dcd73d7c043e91bac9fb8d3989cbff54b08790")

Zip code level analysis

Santa Clara County and San Francisco zip code cases

scc_map_cases <- esri2sf("https://services2.arcgis.com/RiZWfy7B1r76pKTz/arcgis/rest/services/COVID_zip_FC/FeatureServer/0")
## [1] "Feature Layer"
## [1] "esriGeometryPolygon"
sf_place_cases <- 
  read_csv("https://raw.githubusercontent.com/datadesk/california-coronavirus-data/master/latimes-place-totals.csv") %>%
  filter(county == 'San Francisco')

Get social distancing data

scc_blockgroups <-
  block_groups("CA","Santa Clara", cb=F, progress_bar=F) %>% 
  st_transform('+proj=longlat +datum=WGS84')

sf_blockgroups <- 
  block_groups("CA","San Francisco", cb=F, progress_bar=F) %>% 
  st_transform('+proj=longlat +datum=WGS84')

# zip code areas
zctas_94 <- 
  zctas(cb = F, starts_with = "94") %>% 
  st_transform('+proj=longlat +datum=WGS84') %>%
  dplyr::select(ZCTA5CE10, geometry)

zctas_95 <- 
  zctas(cb = F, starts_with = "95") %>% 
  st_transform('+proj=longlat +datum=WGS84') %>%
  dplyr::select(ZCTA5CE10, geometry)

zctas_94_95 <- rbind(zctas_94, zctas_95)

# join with block groups
scc_bg_zctas <- scc_blockgroups %>% 
  st_centroid() %>%
  st_join(zctas_94_95) %>% 
  dplyr::select(GEOID, ZCTA5CE10) %>%
  rename(blockgroup = GEOID, zipcode = ZCTA5CE10)

sf_bg_zctas <- sf_blockgroups %>% 
  st_centroid() %>%
  st_join(zctas_94) %>% 
  dplyr::select(GEOID, ZCTA5CE10) %>%
  rename(blockgroup = GEOID, zipcode = ZCTA5CE10)

bay_sd <- readRDS("/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/bay_socialdistancing_v2.rds") %>% 
  mutate(date = date_range_start %>%  substr(1,10) %>% as.Date())

# obtaining weekends
weekends <- bay_sd %>% 
  filter(!duplicated(date)) %>% 
  arrange(date) %>% 
  mutate(weekend = ifelse((date %>% as.numeric()) %% 7 %in% c(2,3), T, F)) %>% 
  dplyr::select(date,weekend)

bay_sd <- bay_sd %>% left_join(weekends)

SF zip code over time analysis

Over time cases by zip code

First just plot the cases over time by zip code

# obtain the saved census data 
acs_vars = readRDS("/Users/simonespeizer/Documents/2020 Spring Quarter/CEE 218Z/censusData2018_acs_acs5.rds")

# define a function for pulling census data
pullCensus <- function(variableToPull, county) {
    regionString <- paste0("state:06+county:", county)
    censusData <- getCensus(
      name = "acs/acs5",
      vintage = 2018,
      region = "block group:*", 
      regionin = regionString,
      vars = variableToPull
    ) %>%
    mutate(blockgroup = paste0(state,county,tract,block_group)) %>%
      select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME"))
  
  return(censusData)
}


# get population data for San Francisco
sf_fips <- fips("CA", "San Francisco") %>% substr(3,5)

sf_pop_zip <- pullCensus("B01003_001E", sf_fips) %>% 
  left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode)) %>% 
  group_by(zipcode) %>%
  summarize(total_pop_zip = sum(B01003_001E))

sf_cases_zip <- sf_place_cases %>% left_join(sf_pop_zip, by = c("place" = "zipcode")) %>%
  mutate(cases_by_pop = confirmed_cases / total_pop_zip) %>%
  rename(zipcode = place)

sf_cases_zip %>% ggplot(
  aes(x = date)) +
    geom_line(aes(y = cases_by_pop, color=zipcode))

Something weird is going on with case counts on May 7th and 11th, so I remove those dates manually. Also adjust the day in zip code 94117 that is off.

sf_cases_zip_edit <- sf_cases_zip %>% filter(date != "2020-05-07" & date != "2020-05-11") %>% mutate(confirmed_cases = ifelse(zipcode == "94117" & confirmed_cases == 1, confirmed_cases + 37, confirmed_cases), cases_by_pop = confirmed_cases / total_pop_zip)

sf_cases_zip_edit %>% ggplot(
  aes(x = date)) +
    geom_line(aes(y = cases_by_pop, color=zipcode))

sf_cases_zip_edit %>% ggplot(
  aes(x = date)) +
    geom_line(aes(y = confirmed_cases, color=zipcode))

Bucket by movement right before shelter in place

Get social distancing data by zip code for the first few days before shelter-in-place

# relevant dates
pre_sd_date_cutoff <- as.Date("2020-03-01")
shelter_date <- as.Date("2020-03-17") # note that this is the day the order went into effect

# get daily social distancing data for SF by date
sf_sd_zip_by_date <- bay_sd %>%
  filter(origin_census_block_group %in% sf_blockgroups$GEOID) %>%
  left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
  group_by(zipcode, date) %>%
  summarize(total_at_home_zip = sum(completely_home_device_count), total_devices = sum(device_count)) %>%
  ungroup()

# get block group averages leaving home before shelter in place/COVID-19
sf_pre_sd_leaving_avg <- bay_sd %>%
  filter(origin_census_block_group %in% sf_blockgroups$GEOID) %>%
  filter(date <= pre_sd_date_cutoff)  %>%
  left_join(sf_bg_zctas %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>%
  group_by(zipcode) %>%
  summarize(total_at_home_zip = sum(completely_home_device_count), total_devices = sum(device_count)) %>%
  mutate(
    percent_at_home_pre = total_at_home_zip*100/total_devices,
    percent_leaving_home_pre = (100 - percent_at_home_pre)
  )

# get average percent leaving the home for the 3 days before shelter in place
sf_sd_zip_3_days_pre <- sf_sd_zip_by_date %>%
  filter(date < shelter_date & date >= (shelter_date - 3)) %>%
  group_by(zipcode) %>%
  summarize(total_at_home_zip = sum(total_at_home_zip), total_devices = sum(total_devices)) %>% 
  mutate(percent_at_home = total_at_home_zip*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>% 
  left_join(sf_pre_sd_leaving_avg %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
  mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)
  
sf_sd_zip_3_days_pre_buckets <- sf_sd_zip_3_days_pre %>% 
  mutate(bucketed_percent_leaving = ifelse(percent_leaving_home < 65, "65% or below", ifelse(percent_leaving_home < 70, "65% to 70%", "70% or more")), bucketed_dif_percent_leaving = ifelse(dif_percent_leaving > -5, "-5% or less", ifelse(dif_percent_leaving > -10, "-5% to -10%", ifelse(dif_percent_leaving > -15, "-10% to -15%", "-15% or below"))))

# join with the case data
sf_cases_sd_3dayspre_bucketed_abs <- left_join(sf_cases_zip_edit, sf_sd_zip_3_days_pre_buckets) %>%
  group_by(date, bucketed_percent_leaving) %>%
  summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
  mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
  
# plot
sf_cases_sd_3dayspre_bucketed_abs %>% filter(!is.na(bucketed_percent_leaving)) %>% ggplot(aes(x = date)) + 
  geom_line(aes(y = cases_by_pop_grouped, color=bucketed_percent_leaving)) + 
  ggtitle('Cases per capita for zipcodes grouped by % leaving in the 3 days before SIP') + labs(y = "Cases per capita", x = "Date", color = "% leaving home")

# try with differences in percent leaving
sf_cases_sd_3dayspre_bucketed_rel <- left_join(sf_cases_zip_edit, sf_sd_zip_3_days_pre_buckets) %>%
  group_by(date, bucketed_dif_percent_leaving) %>%
  summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
  mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
  
# plot
sf_cases_sd_3dayspre_bucketed_rel %>% filter(!is.na(bucketed_dif_percent_leaving)) %>% ggplot(aes(x = date)) + geom_line(aes(y = cases_by_pop_grouped, color=bucketed_dif_percent_leaving)) + ggtitle('Cases per capita for zipcodes grouped by % leaving in the 3 days before SIP') + labs(y = "Cases per capita", x = "Date", color = "dif in % leaving home relative to pre SIP")

Bucket by movement right after shelter in place

Try now with the 3 days after SIP.

# get average percent leaving the home for the 3 days afterter in place
sf_sd_zip_3_days_post <- sf_sd_zip_by_date %>%
  filter(date >= shelter_date & date < (shelter_date + 3)) %>%
  group_by(zipcode) %>%
  summarize(total_at_home_zip = sum(total_at_home_zip), total_devices = sum(total_devices)) %>% 
  mutate(percent_at_home = total_at_home_zip*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>% 
  left_join(sf_pre_sd_leaving_avg %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
  mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)
  

sf_sd_zip_3_days_post %>% ggplot(aes(x = zipcode, y = percent_leaving_home)) + geom_point()

sf_sd_zip_3_days_post %>% ggplot(aes(x = zipcode, y = dif_percent_leaving)) + geom_point()

sf_sd_zip_3_days_post_buckets <- sf_sd_zip_3_days_post %>% 
  mutate(bucketed_percent_leaving = ifelse(percent_leaving_home < 55, "55% or below", ifelse(percent_leaving_home < 60, "55% to 60%", ifelse(percent_leaving_home < 65, "60% to 65%","65% or more"))), bucketed_dif_percent_leaving = ifelse(dif_percent_leaving > -10, "-10% or less", ifelse(dif_percent_leaving > -15, "-10% to -15%", ifelse(dif_percent_leaving > -20, "-15% to -20%", ifelse(dif_percent_leaving > -25, "-20% to -25%", "-25% or below")))))

# join with the case data
sf_cases_sd_3dayspost_bucketed_abs <- left_join(sf_cases_zip_edit, sf_sd_zip_3_days_post_buckets) %>%
  group_by(date, bucketed_percent_leaving) %>%
  summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
  mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
  
# plot
sf_cases_sd_3dayspost_bucketed_abs %>% filter(!is.na(bucketed_percent_leaving)) %>% ggplot(aes(x = date)) + geom_line(aes(y = cases_by_pop_grouped, color=bucketed_percent_leaving)) + ggtitle('Cases per capita for zipcodes grouped by % leaving in the 3 days on and after SIP') + labs(y = "Cases per capita", x = "Date", color = "% leaving home")

# try with differences in percent leaving
sf_cases_sd_3dayspost_bucketed_rel <- left_join(sf_cases_zip_edit, sf_sd_zip_3_days_post_buckets) %>%
  group_by(date, bucketed_dif_percent_leaving) %>%
  summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
  mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
  
# plot
sf_cases_sd_3dayspost_bucketed_rel %>% filter(!is.na(bucketed_dif_percent_leaving)) %>% ggplot(aes(x = date)) + geom_line(aes(y = cases_by_pop_grouped, color=bucketed_dif_percent_leaving)) + ggtitle('Cases per capita for zipcodes grouped by % leaving in the 3 days on and after SIP') + labs(y = "Cases per capita", x = "Date", color = "dif in % leaving home relative to pre SIP")

Bucket by movement through April 20th

Try the whole time from SIP through April 20th, the first day that we have case data

# get average percent leaving the home for the 3 days after shelter in place
sf_sd_zip_till420 <- sf_sd_zip_by_date %>%
  filter(date >= shelter_date & date < ("2020-04-20")) %>%
  group_by(zipcode) %>%
  summarize(total_at_home_zip = sum(total_at_home_zip), total_devices = sum(total_devices)) %>% 
  mutate(percent_at_home = total_at_home_zip*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>% 
  left_join(sf_pre_sd_leaving_avg %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
  mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)
  

sf_sd_zip_till420 %>% ggplot(aes(x = zipcode, y = percent_leaving_home)) + geom_point()

sf_sd_zip_till420 %>% ggplot(aes(x = zipcode, y = dif_percent_leaving)) + geom_point()

sf_sd_zip_till420_buckets <- sf_sd_zip_3_days_post %>% 
  mutate(bucketed_percent_leaving = ifelse(percent_leaving_home < 50, "50% or below", ifelse(percent_leaving_home < 55, "50% to 55%", ifelse(percent_leaving_home < 60, "55% to 60%", "60% or more"))), bucketed_dif_percent_leaving = ifelse(dif_percent_leaving > -15, "-15% or less", ifelse(dif_percent_leaving > -20, "-15% to -20%", ifelse(dif_percent_leaving > -25, "-20% to -25%", "-25% or below"))))

# join with the case data
sf_cases_sd_till420_bucketed_abs <- left_join(sf_cases_zip_edit, sf_sd_zip_till420_buckets) %>%
  group_by(date, bucketed_percent_leaving) %>%
  summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
  mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
  
# plot
sf_cases_sd_till420_bucketed_abs %>% filter(!is.na(bucketed_percent_leaving)) %>% ggplot(aes(x = date)) + geom_line(aes(y = cases_by_pop_grouped, color=bucketed_percent_leaving)) + ggtitle('Cases per capita for zipcodes grouped by % leaving since SIP') + labs(y = "Cases per capita", x = "Date", color = "% leaving home")

# scatter plot 
sf_cases_sd_till420_bucketed_abs %>% filter(!is.na(bucketed_percent_leaving)) %>% ggplot(aes(x = bucketed_percent_leaving, y = cases_by_pop_grouped)) + geom_point() + ggtitle('Cases by population through current date, grouped by movement, SF')

# try with differences in percent leaving
sf_cases_sd_till420_bucketed_rel <- left_join(sf_cases_zip_edit, sf_sd_zip_till420_buckets) %>%
  group_by(date, bucketed_dif_percent_leaving) %>%
  summarize(total_cases_grouped = sum(confirmed_cases), total_pop_combined = sum(total_pop_zip)) %>%
  mutate(cases_by_pop_grouped = total_cases_grouped / total_pop_combined)
  
# plot
sf_cases_sd_till420_bucketed_rel %>% filter(!is.na(bucketed_dif_percent_leaving)) %>% ggplot(aes(x = date)) + geom_line(aes(y = cases_by_pop_grouped, color=bucketed_dif_percent_leaving)) + ggtitle('Cases per capita for zipcodes grouped by % leaving since SIP') + labs(y = "Cases per capita", x = "Date", color = "dif in % leaving home relative to pre SIP")

Bucket by case growth and look at movement

Looking back at the original case growth over time by zip codes, it looks like there are generally two groups–one set of zip codes that have >0.003 cases per capita, and one set that have <0.002 cases per capita by May 18th. The first set have rapid growths, while those in the second set generally have less if any growth. Let’s try breaking it up by cases.

sf_cases_zip_bucketed <- sf_cases_zip_edit %>% 
  filter(date == max(date)) %>%
  mutate(bucketed_case_level = ifelse(cases_by_pop <=0.002, "< 0.002", "> 0.002"))

sf_sd_zip_cases_bucket <- left_join(sf_cases_zip_bucketed %>% dplyr::select(-date), sf_sd_zip_by_date)

sf_sd_zip_cases_bucket_movement <- sf_sd_zip_cases_bucket %>% filter(!is.na(bucketed_case_level)) %>%
  group_by(bucketed_case_level, date) %>%
  summarize(total_at_home_bucket = sum(total_at_home_zip), total_devices = sum(total_devices)) %>% 
  mutate(percent_at_home = total_at_home_bucket*100/total_devices, percent_leaving_home = (100 - percent_at_home))

sf_sd_zip_cases_bucket_movement %>% filter(!is.na(bucketed_case_level)) %>%  ggplot(aes(x = date)) + 
  geom_line(aes(y = percent_leaving_home, color=bucketed_case_level)) + ggtitle('% leaving home over time grouped by cases per capita') + labs(y = "% leaving home", x = "Date", color = "cases per capita by May 18th")

fig_sf_movt_case_bucket <- sf_sd_zip_cases_bucket_movement %>% filter(!is.na(bucketed_case_level)) %>% 
  plot_ly() %>%
  add_lines(x = ~date, y = ~percent_leaving_home, color = ~bucketed_case_level) %>%
  layout(xaxis = list(title = 'Date'), yaxis = list(title = '% leaving home'), legend = list(title = list(text = "Cases per capita by May 18th")), title = "% leaving home over time grouped by cases per capita")

fig_sf_movt_case_bucket

Also pretty weird. Try looking just at after shelter in place, and the difference relative to before.

# try looking just after shelter in place
# get pre shelter in place averages
sf_sd_zip_cases_bucket_movement_pre_avg <- sf_sd_zip_cases_bucket_movement %>% filter(date < pre_sd_date_cutoff) %>% ungroup() %>%
  group_by(bucketed_case_level) %>%
  summarize(total_at_home_pre = sum(total_at_home_bucket), total_devices = sum(total_devices)) %>%
  mutate(percent_at_home= total_at_home_pre*100/total_devices, percent_leaving_home_pre = (100-percent_at_home))

sf_sd_zip_cases_bucket_movement_post <- sf_sd_zip_cases_bucket_movement %>% filter(date >= shelter_date) %>% 
  left_join(sf_sd_zip_cases_bucket_movement_pre_avg %>% dplyr::select(bucketed_case_level, percent_leaving_home_pre)) %>%
  mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)

sf_sd_zip_cases_bucket_movement_post %>% filter(!is.na(bucketed_case_level)) %>% ggplot(aes(x = date)) + 
  geom_line(aes(y = dif_percent_leaving, color=bucketed_case_level)) + ggtitle('% leaving home over time (relative to pre SIP) grouped by cases per capita') + labs(y = "difference in % leaving home", x = "Date", color = "cases per capita by May 18th")

fig_sf_movt_case_bucket_post <- sf_sd_zip_cases_bucket_movement_post %>% filter(!is.na(bucketed_case_level)) %>% 
  plot_ly() %>%
  add_lines(x = ~date, y = ~dif_percent_leaving, color = ~bucketed_case_level) %>%
  layout(xaxis = list(title = 'Date'), yaxis = list(title = 'difference in % leaving home'), legend = list(title = list(text = "Cases per capita by May 18th")), title = "% leaving home over time (relative to pre SIP) grouped by cases per capita")

fig_sf_movt_case_bucket_post

That does look more like what we would expect.

Santa Clara County case data

First just visualize what the case data looks like.

scc_cases_zip <- scc_map_cases %>% 
  dplyr::select(Zipcode, Cases, Population) %>% 
  st_drop_geometry() %>%
  rename(zipcode = Zipcode, cases = Cases, pop = Population) %>%
  mutate(cases = replace_na(cases, 0)) %>%
  mutate(cases_by_pop = cases/pop) %>%
  filter(is.numeric(cases_by_pop))

scc_cases_zip %>% ggplot(aes(x = zipcode, y = cases_by_pop)) + geom_point()

Visualize zip code level leaving home data for SCC.

# get leaving home data on zip code level
scc_sd_zip_by_date <- bay_sd %>%
  filter(origin_census_block_group %in% scc_blockgroups$GEOID) %>%
  left_join(scc_bg_zctas %>% dplyr::select(blockgroup, zipcode), by = c("origin_census_block_group" = "blockgroup")) %>% 
  group_by(zipcode, date) %>%
  summarize(total_at_home_zip = sum(completely_home_device_count), total_devices = sum(device_count)) %>% 
  mutate(
    percent_at_home = total_at_home_zip*100/total_devices, 
    percent_leaving_home = (100 - percent_at_home)
  )

scc_sd_zip_by_date %>% ggplot(aes(x = date)) + geom_line(aes(y = percent_leaving_home, color = zipcode))

Try first looking at the overall % leaving home since SIP and cases as of pulled on May 20th.

# get zip code averages leaving home before SIP
scc_sd_zip_pre_avgs <- scc_sd_zip_by_date %>%
  filter(date < pre_sd_date_cutoff) %>%
  group_by(zipcode) %>%
  summarize(total_at_home_pre = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
  mutate(percent_at_home_pre = total_at_home_pre*100/total_devices, percent_leaving_home_pre = (100 - percent_at_home_pre))

scc_sd_zip_post <- scc_sd_zip_by_date %>%
  filter(date >= shelter_date) %>%
  group_by(zipcode) %>%
  summarize(total_at_home = sum(total_at_home_zip), total_devices = sum(total_devices)) %>%
  mutate(percent_at_home = total_at_home*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>%
  left_join(scc_sd_zip_pre_avgs %>% dplyr::select(zipcode, percent_leaving_home_pre)) %>%
  mutate(dif_percent_leaving = percent_at_home - percent_leaving_home_pre) %>%
  left_join(scc_cases_zip)

# absolute percent leaving
scc_sd_zip_post %>% ggplot(
  aes(x = percent_leaving_home, y = cases_by_pop)) + geom_point() + ggtitle("Santa Clara County") + geom_smooth(method=lm)

scc_zip_cases_sd_cor_abs <- lm(cases_by_pop ~ percent_leaving_home, scc_sd_zip_post , na.action = na.omit)

summary(scc_zip_cases_sd_cor_abs)
## 
## Call:
## lm(formula = cases_by_pop ~ percent_leaving_home, data = scc_sd_zip_post, 
##     na.action = na.omit)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0013749 -0.0004338 -0.0001102  0.0003646  0.0031726 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)
## (Intercept)          4.492e-04  9.069e-04   0.495    0.622
## percent_leaving_home 1.338e-05  1.816e-05   0.737    0.465
## 
## Residual standard error: 0.0008889 on 52 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.01033,    Adjusted R-squared:  -0.008698 
## F-statistic: 0.543 on 1 and 52 DF,  p-value: 0.4645
# try with difference in percent leaving
scc_sd_zip_post %>% ggplot(
  aes(x = dif_percent_leaving, y = cases_by_pop)) + geom_point() + ggtitle("Santa Clara County") + geom_smooth(method=lm)

scc_zip_cases_sd_cor <- lm(cases_by_pop ~ dif_percent_leaving, scc_sd_zip_post , na.action = na.omit)

summary(scc_zip_cases_sd_cor)
## 
## Call:
## lm(formula = cases_by_pop ~ dif_percent_leaving, data = scc_sd_zip_post, 
##     na.action = na.omit)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0013273 -0.0004665 -0.0001358  0.0003810  0.0031728 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)  
## (Intercept)          9.088e-04  4.123e-04   2.205   0.0319 *
## dif_percent_leaving -7.837e-06  1.524e-05  -0.514   0.6092  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0008913 on 52 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.005061,   Adjusted R-squared:  -0.01407 
## F-statistic: 0.2645 on 1 and 52 DF,  p-value: 0.6092

Bucketing cases

Try bucketing by cases per capita.

# bucket case counts
scc_cases_bucketed <- scc_cases_zip %>%
  mutate(bucketed_cases_by_pop = ifelse(cases_by_pop > 0.002, "0.002 or more", ifelse(cases_by_pop > 0.0015, "0.0015 to 0.002", ifelse(cases_by_pop > 0.001, "0.001 to 0.0015", ifelse(cases_by_pop > 0, "0.00001 to 0.001", "0")))))

scc_sd_zip_by_date_case_bucket <- left_join(scc_cases_bucketed, scc_sd_zip_by_date) %>%
  filter(!is.na(bucketed_cases_by_pop) & !is.na(date)) %>%
  group_by(date, bucketed_cases_by_pop) %>%
  summarize(total_at_home_bucket = sum(total_at_home_zip), total_devices = sum(total_devices)) %>% 
  mutate(percent_at_home = total_at_home_bucket*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>% ungroup()

fig_scc_movt_case_bucket <- plot_ly(scc_sd_zip_by_date_case_bucket) %>%
  add_trace(x = ~date, y = ~percent_leaving_home, color = ~bucketed_cases_by_pop, type = 'scatter', mode = 'lines') %>%
  layout(xaxis = list(title = 'Date'), yaxis = list(title = '% leaving home'), legend = list(title = list(text = "Cases per capita")), title = "% leaving home over time grouped by cases per capita, Santa Clara County")

fig_scc_movt_case_bucket
# try looking at this in scatter plot form
scc_sd_zip_by_date_case_bucket_pre <- scc_sd_zip_by_date_case_bucket %>%
  filter(date < pre_sd_date_cutoff) %>%
  group_by(bucketed_cases_by_pop) %>%
  summarize(total_at_home_bucket = sum(total_at_home_bucket), total_devices = sum(total_devices)) %>% 
  mutate(percent_at_home_pre = total_at_home_bucket*100/total_devices, percent_leaving_home_pre = (100 - percent_at_home_pre)) %>% ungroup()

# join with pre data
scc_sd_zip_by_date_case_bucket_post <- scc_sd_zip_by_date_case_bucket %>%
  filter(date >= shelter_date) %>%
  group_by(bucketed_cases_by_pop) %>%
  summarize(total_at_home_bucket = sum(total_at_home_bucket), total_devices = sum(total_devices)) %>% 
  mutate(percent_at_home = total_at_home_bucket*100/total_devices, percent_leaving_home = (100 - percent_at_home)) %>% ungroup() %>%
  left_join(scc_sd_zip_by_date_case_bucket_pre %>% dplyr::select(bucketed_cases_by_pop, percent_leaving_home_pre)) %>%
  mutate(dif_percent_leaving = percent_leaving_home - percent_leaving_home_pre)

# plot - absolute metric
scc_sd_zip_by_date_case_bucket_post %>% ggplot(aes(x = percent_leaving_home, y = bucketed_cases_by_pop)) + geom_point() + ggtitle('Percent leaving home and cases per capita for buckets of cases per capita, Santa Clara')

# plot - relative metric
scc_sd_zip_by_date_case_bucket_post %>% ggplot(aes(x = dif_percent_leaving, y = bucketed_cases_by_pop)) + geom_point() + ggtitle('Percent leaving home and cases per capita for buckets of cases per capita, Santa Clara')

That generally looks as we’d expect it to–except for the zip codes with zero cases, but it seems like a fair hypothesis that something different might be going on in those counties anyway.

Bucketing movement

Now let’s look at bucketing movement into categories, like we did for SF, but here we can only look at the outcome of cases at a specific date not over time.

To start, just looking at bucketing overall movement since SIP.

ggplot(scc_sd_zip_post, aes(x = zipcode, y = percent_leaving_home)) + geom_point()

ggplot(scc_sd_zip_post, aes(x = zipcode, y = dif_percent_leaving)) + geom_point()

scc_sd_zip_post_bucketed <- scc_sd_zip_post %>%
  mutate(bucketed_percent_leaving = ifelse(percent_leaving_home > 60, "60% or more", ifelse(percent_leaving_home > 55, "55% to 60%", ifelse(percent_leaving_home > 50, "50% to 55%", ifelse(percent_leaving_home > 45, "45% to 50%", ifelse(percent_leaving_home > 40, "40% to 45%", "40% or less"))))), bucketed_dif_leaving = ifelse(dif_percent_leaving > 0, "0% or above", ifelse(dif_percent_leaving > -20, "0% to -20%", ifelse(dif_percent_leaving > -25, "-20% to -25%", ifelse(dif_percent_leaving > -30, "-25% to -30%", ifelse(dif_percent_leaving > -35, "-30% to -35%", ifelse(dif_percent_leaving > -40, "-35% to -40%", "-40% or lower"))))))) %>%
  left_join(scc_cases_zip) %>%
  filter(!is.na(cases))

scc_sd_cases_post_bucketed_abs <- scc_sd_zip_post_bucketed %>% 
  group_by(bucketed_percent_leaving) %>%
  summarize(total_cases = sum(cases), total_pop = sum(pop)) %>%
  mutate(cases_by_pop_grouped = total_cases/total_pop)

scc_sd_cases_post_bucketed_abs %>% ggplot(aes(x = bucketed_percent_leaving, y = cases_by_pop_grouped)) + geom_point() + ggtitle('Santa Clara')

scc_sd_cases_post_bucketed_rel <- scc_sd_zip_post_bucketed %>% 
  group_by(bucketed_dif_leaving) %>%
  summarize(total_cases = sum(cases), total_pop = sum(pop)) %>%
  mutate(cases_by_pop_grouped = total_cases/total_pop)

scc_sd_cases_post_bucketed_rel %>% ggplot(aes(x = bucketed_dif_leaving, y = cases_by_pop_grouped)) + geom_point() + ggtitle('Santa Clara')

Wow! Trends that (generally) make sense!